
function [newcorp,newauth] = unskew (profiles,authors)
%UNSKEW Balance a corpus by oversampling under-represented authors.
%   [NEWCORP,NEWAUTH] = UNSKEW(PROFILES,AUTHORS) appends duplicated rows of
%   PROFILES (and the matching entries of AUTHORS) until every author has
%   as many documents as the most frequent author.
%
%   Inputs:
%     profiles - array or cell array indexed by row; row k is the document
%                that belongs to authors(k).
%     authors  - vector of author labels, one per row of profiles. Any
%                label type accepted by UNIQUE works; a row vector is
%                accepted and treated as a column.
%
%   Outputs:
%     newcorp  - profiles followed by the duplicated rows.
%     newauth  - column vector of labels aligned with the rows of newcorp.
%
%   For an author that is short by NEED documents and owns N documents,
%   floor(NEED/N) complete (shuffled) copies of all N documents are added
%   first; the remaining NEED - floor(NEED/N)*N documents are drawn
%   uniformly at random, with replacement, from that author's documents.
%   Duplicates are appended author by author, in the order given by UNIQUE.

% Nothing to balance: hand the inputs back untouched.
if isempty(authors)
    newcorp = profiles;
    newauth = authors;
    return;
end

% Work on a column so grouping and concatenation do not depend on the
% orientation the caller happened to use.
authors = authors(:);

% grp(k) is the group number of authors(k); grouping through UNIQUE avoids
% comparing labels with ==, which never matches NaN labels.
[uniqueA, unusedIdx, grp] = unique(authors); %#ok<ASGLU>
grp     = grp(:);
nGroups = numel(uniqueA);

% Documents per author, computed once, and the target count per author.
counts = accumarray(grp, 1, [nGroups 1]);
tmax   = max(counts);

% Collect the row indices to duplicate per author and concatenate once at
% the end instead of growing the outputs inside the loops.
extra = cell(nGroups, 1);
for g = 1:nGroups
    members = find(grp == g);   % rows belonging to this author
    n       = counts(g);
    need    = tmax - n;         % number of documents short
    nFull   = floor(need / n);  % how many whole copies fit
    nRest   = need - nFull * n; % leftover, always smaller than n

    picks = zeros(need, 1);
    for c = 1:nFull
        % one complete copy of the author's documents, in random order
        picks((c - 1) * n + (1:n)) = members(randperm(n));
    end
    if nRest > 0
        % remaining documents: independent uniform draws with replacement
        picks(nFull * n + (1:nRest)) = members(randi(n, nRest, 1));
    end
    extra{g} = picks;
end
extraIdx = vertcat(extra{:});

newcorp = [profiles; profiles(extraIdx, :)]; % duplicating entries
newauth = [authors;  authors(extraIdx)];


